import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import plotly.graph_objects as go
# Load the Iris dataset
iris = load_iris()
iris
{'data': array([[5.1, 3.5, 1.4, 0.2],
[4.9, 3. , 1.4, 0.2],
[4.7, 3.2, 1.3, 0.2],
[4.6, 3.1, 1.5, 0.2],
[5. , 3.6, 1.4, 0.2],
[5.4, 3.9, 1.7, 0.4],
[4.6, 3.4, 1.4, 0.3],
[5. , 3.4, 1.5, 0.2],
[4.4, 2.9, 1.4, 0.2],
[4.9, 3.1, 1.5, 0.1],
[5.4, 3.7, 1.5, 0.2],
[4.8, 3.4, 1.6, 0.2],
[4.8, 3. , 1.4, 0.1],
[4.3, 3. , 1.1, 0.1],
[5.8, 4. , 1.2, 0.2],
[5.7, 4.4, 1.5, 0.4],
[5.4, 3.9, 1.3, 0.4],
[5.1, 3.5, 1.4, 0.3],
[5.7, 3.8, 1.7, 0.3],
[5.1, 3.8, 1.5, 0.3],
[5.4, 3.4, 1.7, 0.2],
[5.1, 3.7, 1.5, 0.4],
[4.6, 3.6, 1. , 0.2],
[5.1, 3.3, 1.7, 0.5],
[4.8, 3.4, 1.9, 0.2],
[5. , 3. , 1.6, 0.2],
[5. , 3.4, 1.6, 0.4],
[5.2, 3.5, 1.5, 0.2],
[5.2, 3.4, 1.4, 0.2],
[4.7, 3.2, 1.6, 0.2],
[4.8, 3.1, 1.6, 0.2],
[5.4, 3.4, 1.5, 0.4],
[5.2, 4.1, 1.5, 0.1],
[5.5, 4.2, 1.4, 0.2],
[4.9, 3.1, 1.5, 0.2],
[5. , 3.2, 1.2, 0.2],
[5.5, 3.5, 1.3, 0.2],
[4.9, 3.6, 1.4, 0.1],
[4.4, 3. , 1.3, 0.2],
[5.1, 3.4, 1.5, 0.2],
[5. , 3.5, 1.3, 0.3],
[4.5, 2.3, 1.3, 0.3],
[4.4, 3.2, 1.3, 0.2],
[5. , 3.5, 1.6, 0.6],
[5.1, 3.8, 1.9, 0.4],
[4.8, 3. , 1.4, 0.3],
[5.1, 3.8, 1.6, 0.2],
[4.6, 3.2, 1.4, 0.2],
[5.3, 3.7, 1.5, 0.2],
[5. , 3.3, 1.4, 0.2],
[7. , 3.2, 4.7, 1.4],
[6.4, 3.2, 4.5, 1.5],
[6.9, 3.1, 4.9, 1.5],
[5.5, 2.3, 4. , 1.3],
[6.5, 2.8, 4.6, 1.5],
[5.7, 2.8, 4.5, 1.3],
[6.3, 3.3, 4.7, 1.6],
[4.9, 2.4, 3.3, 1. ],
[6.6, 2.9, 4.6, 1.3],
[5.2, 2.7, 3.9, 1.4],
[5. , 2. , 3.5, 1. ],
[5.9, 3. , 4.2, 1.5],
[6. , 2.2, 4. , 1. ],
[6.1, 2.9, 4.7, 1.4],
[5.6, 2.9, 3.6, 1.3],
[6.7, 3.1, 4.4, 1.4],
[5.6, 3. , 4.5, 1.5],
[5.8, 2.7, 4.1, 1. ],
[6.2, 2.2, 4.5, 1.5],
[5.6, 2.5, 3.9, 1.1],
[5.9, 3.2, 4.8, 1.8],
[6.1, 2.8, 4. , 1.3],
[6.3, 2.5, 4.9, 1.5],
[6.1, 2.8, 4.7, 1.2],
[6.4, 2.9, 4.3, 1.3],
[6.6, 3. , 4.4, 1.4],
[6.8, 2.8, 4.8, 1.4],
[6.7, 3. , 5. , 1.7],
[6. , 2.9, 4.5, 1.5],
[5.7, 2.6, 3.5, 1. ],
[5.5, 2.4, 3.8, 1.1],
[5.5, 2.4, 3.7, 1. ],
[5.8, 2.7, 3.9, 1.2],
[6. , 2.7, 5.1, 1.6],
[5.4, 3. , 4.5, 1.5],
[6. , 3.4, 4.5, 1.6],
[6.7, 3.1, 4.7, 1.5],
[6.3, 2.3, 4.4, 1.3],
[5.6, 3. , 4.1, 1.3],
[5.5, 2.5, 4. , 1.3],
[5.5, 2.6, 4.4, 1.2],
[6.1, 3. , 4.6, 1.4],
[5.8, 2.6, 4. , 1.2],
[5. , 2.3, 3.3, 1. ],
[5.6, 2.7, 4.2, 1.3],
[5.7, 3. , 4.2, 1.2],
[5.7, 2.9, 4.2, 1.3],
[6.2, 2.9, 4.3, 1.3],
[5.1, 2.5, 3. , 1.1],
[5.7, 2.8, 4.1, 1.3],
[6.3, 3.3, 6. , 2.5],
[5.8, 2.7, 5.1, 1.9],
[7.1, 3. , 5.9, 2.1],
[6.3, 2.9, 5.6, 1.8],
[6.5, 3. , 5.8, 2.2],
[7.6, 3. , 6.6, 2.1],
[4.9, 2.5, 4.5, 1.7],
[7.3, 2.9, 6.3, 1.8],
[6.7, 2.5, 5.8, 1.8],
[7.2, 3.6, 6.1, 2.5],
[6.5, 3.2, 5.1, 2. ],
[6.4, 2.7, 5.3, 1.9],
[6.8, 3. , 5.5, 2.1],
[5.7, 2.5, 5. , 2. ],
[5.8, 2.8, 5.1, 2.4],
[6.4, 3.2, 5.3, 2.3],
[6.5, 3. , 5.5, 1.8],
[7.7, 3.8, 6.7, 2.2],
[7.7, 2.6, 6.9, 2.3],
[6. , 2.2, 5. , 1.5],
[6.9, 3.2, 5.7, 2.3],
[5.6, 2.8, 4.9, 2. ],
[7.7, 2.8, 6.7, 2. ],
[6.3, 2.7, 4.9, 1.8],
[6.7, 3.3, 5.7, 2.1],
[7.2, 3.2, 6. , 1.8],
[6.2, 2.8, 4.8, 1.8],
[6.1, 3. , 4.9, 1.8],
[6.4, 2.8, 5.6, 2.1],
[7.2, 3. , 5.8, 1.6],
[7.4, 2.8, 6.1, 1.9],
[7.9, 3.8, 6.4, 2. ],
[6.4, 2.8, 5.6, 2.2],
[6.3, 2.8, 5.1, 1.5],
[6.1, 2.6, 5.6, 1.4],
[7.7, 3. , 6.1, 2.3],
[6.3, 3.4, 5.6, 2.4],
[6.4, 3.1, 5.5, 1.8],
[6. , 3. , 4.8, 1.8],
[6.9, 3.1, 5.4, 2.1],
[6.7, 3.1, 5.6, 2.4],
[6.9, 3.1, 5.1, 2.3],
[5.8, 2.7, 5.1, 1.9],
[6.8, 3.2, 5.9, 2.3],
[6.7, 3.3, 5.7, 2.5],
[6.7, 3. , 5.2, 2.3],
[6.3, 2.5, 5. , 1.9],
[6.5, 3. , 5.2, 2. ],
[6.2, 3.4, 5.4, 2.3],
[5.9, 3. , 5.1, 1.8]]),
'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
'frame': None,
'target_names': array(['setosa', 'versicolor', 'virginica'], dtype='<U10'),
'DESCR': '.. _iris_dataset:\n\nIris plants dataset\n--------------------\n\n**Data Set Characteristics:**\n\n :Number of Instances: 150 (50 in each of three classes)\n :Number of Attributes: 4 numeric, predictive attributes and the class\n :Attribute Information:\n - sepal length in cm\n - sepal width in cm\n - petal length in cm\n - petal width in cm\n - class:\n - Iris-Setosa\n - Iris-Versicolour\n - Iris-Virginica\n \n :Summary Statistics:\n\n ============== ==== ==== ======= ===== ====================\n Min Max Mean SD Class Correlation\n ============== ==== ==== ======= ===== ====================\n sepal length: 4.3 7.9 5.84 0.83 0.7826\n sepal width: 2.0 4.4 3.05 0.43 -0.4194\n petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)\n petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)\n ============== ==== ==== ======= ===== ====================\n\n :Missing Attribute Values: None\n :Class Distribution: 33.3% for each of 3 classes.\n :Creator: R.A. Fisher\n :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n :Date: July, 1988\n\nThe famous Iris database, first used by Sir R.A. Fisher. The dataset is taken\nfrom Fisher\'s paper. Note that it\'s the same as in R, but not as in the UCI\nMachine Learning Repository, which has two wrong data points.\n\nThis is perhaps the best known database to be found in the\npattern recognition literature. Fisher\'s paper is a classic in the field and\nis referenced frequently to this day. (See Duda & Hart, for example.) The\ndata set contains 3 classes of 50 instances each, where each class refers to a\ntype of iris plant. One class is linearly separable from the other 2; the\nlatter are NOT linearly separable from each other.\n\n.. topic:: References\n\n - Fisher, R.A. "The use of multiple measurements in taxonomic problems"\n Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to\n Mathematical Statistics" (John Wiley, NY, 1950).\n - Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis.\n (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.\n - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System\n Structure and Classification Rule for Recognition in Partially Exposed\n Environments". IEEE Transactions on Pattern Analysis and Machine\n Intelligence, Vol. PAMI-2, No. 1, 67-71.\n - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE Transactions\n on Information Theory, May 1972, 431-433.\n - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al"s AUTOCLASS II\n conceptual clustering system finds 3 classes in the data.\n - Many, many more ...',
'feature_names': ['sepal length (cm)',
'sepal width (cm)',
'petal length (cm)',
'petal width (cm)'],
'filename': 'iris.csv',
'data_module': 'sklearn.datasets.data'}
# Extract features and target variable
X = iris.data
y = iris.target
# Redefine the target variable into two classes: 'virginica' and 'non-virginica'
y_binary = np.where(y == 2, 'virginica', 'non-virginica')
# Create a DataFrame
iris_df = pd.DataFrame(data=X, columns=iris.feature_names)
iris_df['target'] = y_binary
#creating the separate dataframe for each target variable
virginica_data = iris_df[iris_df['target'] == 'virginica']
non_virginica_data = iris_df[iris_df['target'] == 'non-virginica']
# statistics for each variable
virginica_stats = virginica_data.describe()
non_virginica_stats = non_virginica_data.describe()
# Display the tables
print("Descriptive Statistics for Virginica Class:")
print(virginica_stats)
print("\nDescriptive Statistics for Non-Virginica Class:")
print(non_virginica_stats)
Descriptive Statistics for Virginica Class:
sepal length (cm) sepal width (cm) petal length (cm) \
count 50.00000 50.000000 50.000000
mean 6.58800 2.974000 5.552000
std 0.63588 0.322497 0.551895
min 4.90000 2.200000 4.500000
25% 6.22500 2.800000 5.100000
50% 6.50000 3.000000 5.550000
75% 6.90000 3.175000 5.875000
max 7.90000 3.800000 6.900000
petal width (cm)
count 50.00000
mean 2.02600
std 0.27465
min 1.40000
25% 1.80000
50% 2.00000
75% 2.30000
max 2.50000
Descriptive Statistics for Non-Virginica Class:
sepal length (cm) sepal width (cm) petal length (cm) \
count 100.000000 100.000000 100.000000
mean 5.471000 3.099000 2.861000
std 0.641698 0.478739 1.449549
min 4.300000 2.000000 1.000000
25% 5.000000 2.800000 1.500000
50% 5.400000 3.050000 2.450000
75% 5.900000 3.400000 4.325000
max 7.000000 4.400000 5.100000
petal width (cm)
count 100.000000
mean 0.786000
std 0.565153
min 0.100000
25% 0.200000
50% 0.800000
75% 1.300000
max 1.800000
# Plot histograms for each feature
for feature in iris.feature_names:
plt.figure(figsize=(8, 5))
sns.histplot(data=iris_df, x=feature, hue='target', kde=True)
plt.title(f'Histogram for {feature} in Virginica and Non-Virginica Classes')
plt.show()
In each graph, the histograms allow you to observe the spread and concentration of data points for each feature within the "Virginica" and "Non-Virginica" classes. For Sepal Length, non- virginica is right skewed whereas virginica is left skewed. For sepal width, non-virginica is approximately normalized in form, whereas the virginica show sign of right skewness. For Petal length and petal width does not provide a visual answer, which means there are invidence of NaN or null values.
# Correlation matrix
correlation_matrix = iris_df.corr()
print(correlation_matrix)
sepal length (cm) sepal width (cm) petal length (cm) \
sepal length (cm) 1.000000 -0.117570 0.871754
sepal width (cm) -0.117570 1.000000 -0.428440
petal length (cm) 0.871754 -0.428440 1.000000
petal width (cm) 0.817941 -0.366126 0.962865
petal width (cm)
sepal length (cm) 0.817941
sepal width (cm) -0.366126
petal length (cm) 0.962865
petal width (cm) 1.000000
C:\Users\techv\AppData\Local\Temp\ipykernel_21548\2453095933.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. correlation_matrix = iris_df.corr()
# Plot the correlation matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix of Iris Features')
plt.show()
The above graph provides the relationship with respect to features. For instances:
# Violin plots for all features
features = iris.feature_names
for feature in features:
plt.figure(figsize=(12, 6))
sns.violinplot(x="target", y=feature, data=iris_df, inner="quartile", palette={"virginica": "purple", "non-virginica": "orange"})
plt.title(f"Violin Plot of {feature} by Class")
plt.show()
Violin plots are used to visualize the distribution of a numeric variable across different categories. The violin shape represents the probability density of the data at different values. Wider sections indicate a higher probability density, and narrower sections indicate lower density.
For More Information: Violin Plot
# Plot box plots for each feature with respect to the target variable
for feature in iris.feature_names:
plt.figure(figsize=(12, 6))
sns.boxplot(x='target', y=feature, data=iris_df, palette={"virginica": "purple", "non-virginica": "orange"})
plt.title(f'Box Plot of {feature} by Class')
plt.show()
The above box plots provide a visual representation of the distribution of each feature in the Iris dataset with respect to the target variable, which has two classes: 'virginica' and 'non-virginica'. Points outside the whiskers are considered outliers. It provides insights, into the central tendency, spread, and potential outliers for each feature, helping to understand the distribution of the data
For more Information: Box Plot
# Pair plot
sns.set(style="ticks")
sns.pairplot(iris_df, hue="target", palette={"virginica": "purple", "non-virginica": "orange"})
plt.suptitle("Pair Plot of Iris Features by Class", y=1.02)
plt.show()
C:\Users\techv\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
The pair plot provides scatterplots for each pair of features, colored by class ('virginica' or 'non-virginica'). It helps us visualize relationships between different features and observe potential patterns or separations between classes.
For more information: Pair Plot
# Split the data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y_binary, test_size=0.25, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.6, random_state=42)
# Function to train logistics regression models
def train(features):
# Select the specified features
X_train_subset = X_train[:, features]
X_val_subset = X_val[:, features]
X_test_subset = X_test[:, features]
# Train logistic regression model
model = LogisticRegression(random_state=42)
model.fit(X_train_subset, y_train)
# Predictions on the validation set
y_val_pred = model.predict(X_val_subset)
# Calculate accuracy on the validation set
accuracy = accuracy_score(y_val, y_val_pred)
return model, accuracy
# List of feature combinations to try
feature_combinations = [
[0], # One by One feature
[1],
[2],
[3],
[0, 1], # Two features
[0, 2],
[1, 2],
[0, 1, 2], # Three features
]
# Train for each feature combination
for features in feature_combinations:
model, accuracy = train(features)
print(f"Features: {features}, Accuracy: {accuracy:.2f}")
Features: [0], Accuracy: 0.87 Features: [1], Accuracy: 0.60 Features: [2], Accuracy: 1.00 Features: [3], Accuracy: 1.00 Features: [0, 1], Accuracy: 0.93 Features: [0, 2], Accuracy: 1.00 Features: [1, 2], Accuracy: 1.00 Features: [0, 1, 2], Accuracy: 1.00
# Train for each feature combination
for features in feature_combinations:
model, accuracy = train(features)
# Predictions on the validation set
y_val_pred_prob = model.predict_proba(X_val[:, features])[:, 1]
y_val_pred = model.predict(X_val[:, features])
# Create a DataFrame for each model
result_df = pd.DataFrame({
'Instance': range(1, len(X_val) + 1),
'Probability_Virginica': y_val_pred_prob,
'Predicted': y_val_pred,
'Ground Truth': y_val
})
# Display the table
print(f"Table for Features: {features}, Accuracy: {accuracy:.2f}")
print(result_df)
#calculate the precision, recall and f1
precision = precision_score(y_val, y_val_pred, pos_label='virginica', average='binary')
recall = recall_score(y_val, y_val_pred, pos_label='virginica', average='binary')
f1_score_value = f1_score(y_val, y_val_pred, pos_label='virginica', average='binary')
# Summarize the data
precision_classes = precision_score(y_val, y_val_pred, average=None)
recall_classes = recall_score(y_val, y_val_pred, average=None)
f1_classes = f1_score(y_val, y_val_pred, average=None)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1_score_value:.2f}\n")
Table for Features: [0], Accuracy: 0.87
Instance Probability_Virginica Predicted Ground Truth
0 1 0.398271 non-virginica virginica
1 2 0.943719 virginica virginica
2 3 0.045704 non-virginica non-virginica
3 4 0.497836 non-virginica virginica
4 5 0.350992 non-virginica non-virginica
5 6 0.020901 non-virginica non-virginica
6 7 0.548189 virginica virginica
7 8 0.227811 non-virginica non-virginica
8 9 0.597573 virginica virginica
9 10 0.066934 non-virginica non-virginica
10 11 0.227811 non-virginica non-virginica
11 12 0.037659 non-virginica non-virginica
12 13 0.769129 virginica virginica
13 14 0.080708 non-virginica non-virginica
14 15 0.045704 non-virginica non-virginica
Precision: 1.00, Recall: 0.67, F1-Score: 0.80
Table for Features: [1], Accuracy: 0.60
Instance Probability_Virginica Predicted Ground Truth
0 1 0.341824 non-virginica virginica
1 2 0.379518 non-virginica virginica
2 3 0.332684 non-virginica non-virginica
3 4 0.389195 non-virginica virginica
4 5 0.351083 non-virginica non-virginica
5 6 0.323668 non-virginica non-virginica
6 7 0.360456 non-virginica virginica
7 8 0.226581 non-virginica non-virginica
8 9 0.323668 non-virginica virginica
9 10 0.306027 non-virginica non-virginica
10 11 0.360456 non-virginica non-virginica
11 12 0.323668 non-virginica non-virginica
12 13 0.332684 non-virginica virginica
13 14 0.272427 non-virginica non-virginica
14 15 0.341824 non-virginica non-virginica
Precision: 0.00, Recall: 0.00, F1-Score: 0.00
Table for Features: [2], Accuracy: 1.00
Instance Probability_Virginica Predicted Ground Truth
0 1 0.522854 virginica virginica
1 2 0.998485 virginica virginica
2 3 0.000028 non-virginica non-virginica
3 4 0.601438 virginica virginica
4 5 0.233537 non-virginica non-virginica
5 6 0.000011 non-virginica non-virginica
6 7 0.911441 virginica virginica
7 8 0.000021 non-virginica non-virginica
8 9 0.675122 virginica virginica
9 10 0.000028 non-virginica non-virginica
10 11 0.233537 non-virginica non-virginica
11 12 0.000028 non-virginica non-virginica
12 13 0.675122 virginica virginica
13 14 0.000021 non-virginica non-virginica
14 15 0.000015 non-virginica non-virginica
Precision: 1.00, Recall: 1.00, F1-Score: 1.00
Table for Features: [3], Accuracy: 1.00
Instance Probability_Virginica Predicted Ground Truth
0 1 0.639351 virginica virginica
1 2 0.923490 virginica virginica
2 3 0.003812 non-virginica non-virginica
3 4 0.722358 virginica virginica
4 5 0.359309 non-virginica non-virginica
5 6 0.003812 non-virginica non-virginica
6 7 0.848574 virginica virginica
7 8 0.008176 non-virginica non-virginica
8 9 0.792461 virginica virginica
9 10 0.008176 non-virginica non-virginica
10 11 0.206584 non-virginica non-virginica
11 12 0.003812 non-virginica non-virginica
12 13 0.923490 virginica virginica
13 14 0.005585 non-virginica non-virginica
14 15 0.005585 non-virginica non-virginica
Precision: 1.00, Recall: 1.00, F1-Score: 1.00
Table for Features: [0, 1], Accuracy: 0.93
Instance Probability_Virginica Predicted Ground Truth
0 1 0.395505 non-virginica virginica
1 2 0.951534 virginica virginica
2 3 0.043378 non-virginica non-virginica
3 4 0.547310 virginica virginica
4 5 0.357930 non-virginica non-virginica
5 6 0.019006 non-virginica non-virginica
6 7 0.566111 virginica virginica
7 8 0.139412 non-virginica non-virginica
8 9 0.574497 virginica virginica
9 10 0.056509 non-virginica non-virginica
10 11 0.240730 non-virginica non-virginica
11 12 0.034306 non-virginica non-virginica
12 13 0.759617 virginica virginica
13 14 0.058362 non-virginica non-virginica
14 15 0.045153 non-virginica non-virginica
Precision: 1.00, Recall: 0.83, F1-Score: 0.91
Table for Features: [0, 2], Accuracy: 1.00
Instance Probability_Virginica Predicted Ground Truth
0 1 0.531650 virginica virginica
1 2 0.997731 virginica virginica
2 3 0.000031 non-virginica non-virginica
3 4 0.589951 virginica virginica
4 5 0.235600 non-virginica non-virginica
5 6 0.000014 non-virginica non-virginica
6 7 0.912517 virginica virginica
7 8 0.000014 non-virginica non-virginica
8 9 0.645830 virginica virginica
9 10 0.000028 non-virginica non-virginica
10 11 0.264153 non-virginica non-virginica
11 12 0.000032 non-virginica non-virginica
12 13 0.598082 virginica virginica
13 14 0.000019 non-virginica non-virginica
14 15 0.000016 non-virginica non-virginica
Precision: 1.00, Recall: 1.00, F1-Score: 1.00
Table for Features: [1, 2], Accuracy: 1.00
Instance Probability_Virginica Predicted Ground Truth
0 1 0.510675 virginica virginica
1 2 0.998768 virginica virginica
2 3 0.000022 non-virginica non-virginica
3 4 0.637675 virginica virginica
4 5 0.228453 non-virginica non-virginica
5 6 0.000008 non-virginica non-virginica
6 7 0.916493 virginica virginica
7 8 0.000010 non-virginica non-virginica
8 9 0.648686 virginica virginica
9 10 0.000020 non-virginica non-virginica
10 11 0.235498 non-virginica non-virginica
11 12 0.000021 non-virginica non-virginica
12 13 0.657645 virginica virginica
13 14 0.000012 non-virginica non-virginica
14 15 0.000012 non-virginica non-virginica
Precision: 1.00, Recall: 1.00, F1-Score: 1.00
Table for Features: [0, 1, 2], Accuracy: 1.00
Instance Probability_Virginica Predicted Ground Truth
0 1 0.522413 virginica virginica
1 2 0.998105 virginica virginica
2 3 0.000026 non-virginica non-virginica
3 4 0.616890 virginica virginica
4 5 0.231774 non-virginica non-virginica
5 6 0.000011 non-virginica non-virginica
6 7 0.916058 virginica virginica
7 8 0.000008 non-virginica non-virginica
8 9 0.629051 virginica virginica
9 10 0.000021 non-virginica non-virginica
10 11 0.263068 non-virginica non-virginica
11 12 0.000026 non-virginica non-virginica
12 13 0.591204 virginica virginica
13 14 0.000013 non-virginica non-virginica
14 15 0.000013 non-virginica non-virginica
Precision: 1.00, Recall: 1.00, F1-Score: 1.00
C:\Users\techv\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1469: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\techv\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1469: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
The result are each feature combinations, while providing the accuracy, precision, recall and f1-score for the first 15 insatnces.
# Create subplots for each feature combination
fig = go.Figure()
for features in feature_combinations:
model, accuracy = train(features)
if len(features) == 1:
x1_range = np.linspace(X_val[:, features[0]].min(), X_val[:, features[0]].max(), 100)
x1, = np.meshgrid(x1_range)
decision_boundary = -model.intercept_ / model.coef_[0] * x1
fig.add_trace(go.Scatter(x=x1_range, y=decision_boundary.flatten(), mode='lines', name=f'Decision Boundary ({features})'))
elif len(features) == 2:
x1_range = np.linspace(X_val[:, features[0]].min(), X_val[:, features[0]].max(), 100)
x2_range = np.linspace(X_val[:, features[1]].min(), X_val[:, features[1]].max(), 100)
x1, x2 = np.meshgrid(x1_range, x2_range)
if len(features) == 2:
decision_boundary = - (model.intercept_ + model.coef_[0][0] * x1 + model.coef_[0][1] * x2) / model.coef_[0][1]
else:
decision_boundary = - (model.intercept_ + model.coef_[0][0] * x1 + model.coef_[0][1] * x2) / model.coef_[0][2]
fig.add_trace(go.Surface(x=x1, y=x2, z=decision_boundary, opacity=0.8, colorscale='Viridis', name=f'Decision Boundary ({features})'))
# Plot scatter points for each class
for class_label, color in zip(['non-virginica', 'virginica'], ['blue', 'red']):
class_indices = (y_val == class_label)
fig.add_trace(go.Scatter3d(
x=X_val[class_indices, 0],
y=X_val[class_indices, 1],
z=X_val[class_indices, 2],
mode='markers',
marker=dict(color=color),
name=class_label
))
# Set layout
fig.update_layout(scene=dict(
xaxis_title=iris.feature_names[0],
yaxis_title=iris.feature_names[1],
zaxis_title=iris.feature_names[2],
))
# Show plot
fig.show()
The surfaces in the plot represent the decision boundaries that separate the two classes ('non-virginica' and 'virginica') based on the logistic regression models. Blue points belong to the 'non-virginica' class, and red points belong to the 'virginica' class. In this case, the decision boundaries appear to be fairly well-defined, which suggests that the algorithm is able to accurately classify the Iris flowers based on these two features.
# Train for each feature combination and analyze confusion matrix
for features in feature_combinations:
model, accuracy = train(features)
# Predictions on the validation set
y_val_pred = model.predict(X_val[:, features])
# Confusion matrix
cm = confusion_matrix(y_val, y_val_pred, labels=['non-virginica', 'virginica'])
# Display confusion matrix
print(f"Confusion Matrix for Features: {features}")
print(pd.DataFrame(cm, index=['Actual Non-Virginica', 'Actual Virginica'], columns=['Predicted Non-Virginica', 'Predicted Virginica']))
# Analyze failure modes
false_positives = X_val[(y_val == 'non-virginica') & (y_val_pred == 'virginica')]
false_negatives = X_val[(y_val == 'virginica') & (y_val_pred == 'non-virginica')]
# Display failure modes
print(f"False Positives for Features: {features}")
print(false_positives)
print(f"False Negatives for Features: {features}")
print(false_negatives)
print("\n")
Confusion Matrix for Features: [0]
Predicted Non-Virginica Predicted Virginica
Actual Non-Virginica 9 0
Actual Virginica 2 4
False Positives for Features: [0]
[]
False Negatives for Features: [0]
[[6.1 3. 4.9 1.8]
[6.3 2.5 5. 1.9]]
Confusion Matrix for Features: [1]
Predicted Non-Virginica Predicted Virginica
Actual Non-Virginica 9 0
Actual Virginica 6 0
False Positives for Features: [1]
[]
False Negatives for Features: [1]
[[6.1 3. 4.9 1.8]
[7.7 2.6 6.9 2.3]
[6.3 2.5 5. 1.9]
[6.4 2.8 5.6 2.1]
[6.5 3.2 5.1 2. ]
[6.9 3.1 5.1 2.3]]
Confusion Matrix for Features: [2]
Predicted Non-Virginica Predicted Virginica
Actual Non-Virginica 9 0
Actual Virginica 0 6
False Positives for Features: [2]
[]
False Negatives for Features: [2]
[]
Confusion Matrix for Features: [3]
Predicted Non-Virginica Predicted Virginica
Actual Non-Virginica 9 0
Actual Virginica 0 6
False Positives for Features: [3]
[]
False Negatives for Features: [3]
[]
Confusion Matrix for Features: [0, 1]
Predicted Non-Virginica Predicted Virginica
Actual Non-Virginica 9 0
Actual Virginica 1 5
False Positives for Features: [0, 1]
[]
False Negatives for Features: [0, 1]
[[6.1 3. 4.9 1.8]]
Confusion Matrix for Features: [0, 2]
Predicted Non-Virginica Predicted Virginica
Actual Non-Virginica 9 0
Actual Virginica 0 6
False Positives for Features: [0, 2]
[]
False Negatives for Features: [0, 2]
[]
Confusion Matrix for Features: [1, 2]
Predicted Non-Virginica Predicted Virginica
Actual Non-Virginica 9 0
Actual Virginica 0 6
False Positives for Features: [1, 2]
[]
False Negatives for Features: [1, 2]
[]
Confusion Matrix for Features: [0, 1, 2]
Predicted Non-Virginica Predicted Virginica
Actual Non-Virginica 9 0
Actual Virginica 0 6
False Positives for Features: [0, 1, 2]
[]
False Negatives for Features: [0, 1, 2]
[]
# find the best model based on accuracy
best_model = None
best_accuracy = 0.0
best_features = None #had to reset because features variable retains the last value of the loop
for features in feature_combinations:
model, accuracy = train(features)
# best model
if accuracy > best_accuracy:
best_model = model
best_accuracy = accuracy
best_features = features
# Display the best model and its accuracy
print(f"Best Model Features: {best_features}, Best Accuracy: {best_accuracy:.2f}")
# Evaluate the best model on the test set
y_test_pred = best_model.predict(X_test[:, best_features])
# Calculate accuracy on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
# Display the test set results
print("\nTest Set Results:")
print(f"Test Set Accuracy: {test_accuracy:.2f}")
# Confusion matrix for the test set
cm_test = confusion_matrix(y_test, y_test_pred, labels=['non-virginica', 'virginica'])
# Display confusion matrix for the test set
print("\nConfusion Matrix for Test Set:")
print(pd.DataFrame(cm_test, index=['Actual Non-Virginica', 'Actual Virginica'], columns=['Predicted Non-Virginica', 'Predicted Virginica']))
Best Model Features: [2], Best Accuracy: 1.00
Test Set Results:
Test Set Accuracy: 1.00
Confusion Matrix for Test Set:
Predicted Non-Virginica Predicted Virginica
Actual Non-Virginica 17 0
Actual Virginica 0 6
The confusion matrix confirms the model's performance: